home *** CD-ROM | disk | FTP | other *** search
/ CD Actual 3 / CD ACTUAL 3.iso / linux / docs / linux-do / network- / nag-1.002 / nag-1 / bin / makewords < prev   
Encoding:
Text File  |  1994-06-03  |  2.9 KB  |  124 lines

  #!/bin/sh
  # makewords, stage 1: collect every distinct word used in the TeX/LaTeX
  # files named on the command line.
  #
  # Output: /tmp/words.all, one lower-cased word per line, sorted, no
  # duplicates.  Progress (the name of each file as it is read) goes to stderr.

  # extract_words
  # Read TeX/LaTeX source on stdin and write the words it contains to stdout,
  # lower-cased, one per line.  Output is unsorted and may contain duplicates.
  #
  # The sed expressions run in order on every input line:
  #   1. an escaped percent sign (\%) becomes a space, so that step 2 does
  #      not mistake it for the start of a comment;
  #   2. everything from a % to the end of the line is dropped;
  #   3. \begin{..}, \end{..}, \ref{..}, \label{..} and \index{..} are removed
  #      together with their argument, \namedlabel{..}{..} becomes a space;
  #   4. a backslash followed by _, $, # or \ becomes a space;
  #   5. any remaining backslash plus lower-case letters becomes a space.
  # The first tr then blanks out punctuation, the second folds case, and the
  # third puts each word on a line of its own (-s squeezes the empty lines
  # that runs of blanks would otherwise produce).
  extract_words() {
    sed -e 's/\\%/ /g' \
        -e 's/%.*//' \
        -e 's/\\begin{[^}]*}//g' \
        -e 's/\\end{[^}]*}//g' \
        -e 's/\\ref{[^}]*}//g' \
        -e 's/\\namedlabel{[^}]*}{[^}]*}/ /g' \
        -e 's/\\label{[^}]*}//g' \
        -e 's/\\index{[^}]*}//g' \
        -e 's/\\[_$#\]/ /g' \
        -e 's/\\[a-z]*/ /g' |
      tr '{}()[]<>|\\,.:;@^%/~?!&$=*_#"`'"'-" ' ' |
      tr '[:upper:]' '[:lower:]' |
      tr -s ' \t' '\n\n' |
      grep -v '^$'
  }

  printf 'Extracting all words from ' >&2
  {
    # "$@" (not $*) keeps file names containing blanks or glob characters
    # intact; the redirection avoids a cat process per file.
    for f in "$@"; do
      printf '%s ' "$f" >&2
      extract_words < "$f"
    done
    echo >&2
  } | sort -u > /tmp/words.all

  # delta OLD NEW
  # Print the number of lines in file OLD minus the number of lines in file
  # NEW, i.e. how many lines a filtering step removed.
  # Side effect: leaves the two line counts in the globals w1 and w2.
  delta() {
    w1=$(wc -l < "$1")
    w2=$(wc -l < "$2")
    # Shell arithmetic rather than expr: expr exits with status 1 whenever
    # the result is 0, which made a "nothing removed" call look like a failure.
    echo $((w1 - w2))
  }

  # strip_derived LABEL RULE...
  # Remove from /tmp/words.all every word that can be derived from another
  # word in the list by one of the RULEs, then print "...LABEL: N" on stdout,
  # where N is the number of words removed (as reported by delta).
  #
  # Each RULE is a sed substitution without flags (for example 's/y$/ies/').
  # Every rule is applied to every word in the list; the results that a rule
  # actually changed form the set of derivable words, and comm -23 keeps only
  # the words of the list that are not in that set.
  strip_derived() {
    label=$1
    shift
    printf '...%s: ' "$label"
    # Rewrite the argument list in place: each RULE is replaced by
    # "-e g -e RULEp".  `g` restores the untouched word saved by `h` below
    # and the trailing p prints the result only if the rule matched.
    for rule in "$@"; do
      set -- "$@" -e g -e "${rule}p"
      shift
    done
    sed -n -e h "$@" /tmp/words.all | sort > "/tmp/w0.$$"
    comm -23 /tmp/words.all "/tmp/w0.$$" > "/tmp/w1.$$"
    delta /tmp/words.all "/tmp/w1.$$"
    mv "/tmp/w1.$$" /tmp/words.all
  }

  # remove_derivatives
  # Filter /tmp/words.all in place: drop numbers first, then words that are
  # merely an inflected form of another word in the list.  The unfiltered
  # list is kept as /tmp/words.0.  One "...stage: N" line per stage is written
  # to stdout.
  remove_derivatives() {
    cp /tmp/words.all /tmp/words.0
    echo "Removing derivative words" >&2

    # Decimal numbers (and strings of dashes/digits) and 0x... hex constants.
    printf '...numbers: '
    grep -v -e '^[-0-9]*$' -e '^0x[0-9a-f]*$' /tmp/words.all > "/tmp/w1.$$"
    delta /tmp/words.all "/tmp/w1.$$"
    mv "/tmp/w1.$$" /tmp/words.all

    # 's/.$/&&.../' doubles the final letter (run -> running, stop -> stopped).
    strip_derived ing 's/$/ing/' 's/.$/&&ing/' 's/ie$/ying/' 's/e$/ing/'
    strip_derived s 's/$/s/' 's/$/es/' 's/y$/ies/' 's/s$/ses/'
    strip_derived d 's/$/d/' 's/$/ed/' 's/.$/&&ed/' 's/y$/ied/'
    # ble -> bly is anchored to the end of the word like every other suffix
    # rule, so it only rewrites a trailing "ble" (able -> ably).
    strip_derived ly 's/y$/ily/' 's/$/ly/' 's/ble$/bly/'
    strip_derived ion 's/$/ion/' 's/te$/tion/' 's/de$/sion/' \
      's/te$/tions/' 's/de$/sions/'
  }

  remove_derivatives

  # Final stage: remove the scratch files, move the filtered word list into
  # the current directory, and list the words the system dictionary does not
  # know in words.bad.
  rm -f "/tmp/w0.$$" "/tmp/w1.$$"
  mv /tmp/words.all words.all

  echo "extracting probable misspellings"
  # /usr/dict/words is the historical location of the word list; systems
  # that follow the Filesystem Hierarchy Standard keep it in /usr/share/dict.
  dict=/usr/dict/words
  [ -r "$dict" ] || dict=/usr/share/dict/words
  # comm needs both inputs sorted the same way; words.all already is.
  comm -23 words.all "$dict" > words.bad